import pandas as pd
from scipy.stats import ttest_rel, wilcoxon, trim_mean
import numpy as np

# --- 1. 數據讀取和預處理 ---
def load_and_process_data(filepath):
    df = pd.read_csv(filepath)
    # 確保列名一致
    df.columns = [col.lower().replace(" ", "_") for col in df.columns]
    return df

# --- 2. 穩健t檢驗 (Robust t-test) ---
def robust_ttest(x, y, trim_prop):
    n = len(x)
    df = n - 2 * int(trim_prop * n) - 1
    
    x_trimmed = trim_mean(x, trim_prop)
    y_trimmed = trim_mean(y, trim_prop)
    
    # 計算winsorized方差
    x_winsor = np.sort(x)
    y_winsor = np.sort(y)
    g = int(trim_prop * n)
    x_winsor[:g] = x_winsor[g]
    x_winsor[-g:] = x_winsor[-g-1]
    y_winsor[:g] = y_winsor[g]
    y_winsor[-g:] = y_winsor[-g-1]
    
    var_x = np.var(x_winsor, ddof=1)
    var_y = np.var(y_winsor, ddof=1)
    
    se = np.sqrt((var_x + var_y) / (n * (1 - 2 * trim_prop)**2))
    t_stat = (x_trimmed - y_trimmed) / se
    
    # p值計算
    from scipy.stats import t
    p_two_tailed = 2 * (1 - t.cdf(np.abs(t_stat), df))
    p_one_tailed = 1 - t.cdf(t_stat, df)
    
    return t_stat, df, p_one_tailed, p_two_tailed

# --- 3. 複合壓力指數 ---
def calculate_composite_index(df, measures, weights):
    df_z = df.copy()
    for measure in measures:
        df_z[measure] = (df[measure] - df[measure].mean()) / df[measure].std()
    
    df_z["composite_index"] = 0
    for measure, weight in weights.items():
        df_z["composite_index"] += df_z[measure] * weight
        
    return df_z

# --- 主程序 ---
if __name__ == "__main__":
    # 讀取數據
    data_path = 
    df_features = load_and_process_data("/home/ubuntu/extracted_features.csv")

    # 定義要分析的指標
    measures_to_analyze = {
        "bvp_amplitude": "BVP Amplitude",
        "heart_rate": "Heart Rate",
        "gsr_tonic": "GSR",
        "respiration_rate": "Respiration Rate"
    }

    # 分離兩個條件的數據
    with_subtitles = df_features[df_features["condition"] == "with_subtitles"]
    without_subtitles = df_features[df_features["condition"] == "without_subtitles"]

    # --- 主要分析: 穩健t檢驗 ---
    print("="*50)
    print("主要分析: 穩健t檢驗 (Robust t-test) 結果")
    print("="*50)
    results = []
    for col, name in measures_to_analyze.items():
        x = with_subtitles[col].values
        y = without_subtitles[col].values
        
        # 計算效應量
        d = (np.mean(x) - np.mean(y)) / np.sqrt((np.std(x, ddof=1)**2 + np.std(y, ddof=1)**2) / 2)
        
        # 執行穩健t檢驗 (20% trim)
        t_stat, df, p_one, _ = robust_ttest(x, y, 0.2)
        
        results.append({
            "Measure": name,
            "t_stat": t_stat,
            "df": df,
            "p_one_tailed": p_one,
            "cohen_d": d
        })

    df_robust = pd.DataFrame(results)
    print(df_robust.to_string())

    # --- 補充分析: 加權複合壓力指數 ---
    print("\n" + "="*50)
    print("補充分析: 加權複合壓力指數")
    print("="*50)

    # 根據效應量計算權重
    weights = {col: abs(res["cohen_d"]) for col, res in zip(measures_to_analyze.keys(), results)}
    total_weight = sum(weights.values())
    weights = {k: v / total_weight for k, v in weights.items()}

    print("\n各指標權重:")
    for name, w in zip(measures_to_analyze.values(), weights.values()):
        print(f"  {name}: {w:.4f}")

    # 計算複合指數
    df_composite = calculate_composite_index(df_features, list(measures_to_analyze.keys()), weights)

    # 提取複合指數進行t檢驗
    composite_with = df_composite[df_composite["condition"] == "with_subtitles"]["composite_index"].values
    composite_without = df_composite[df_composite["condition"] == "without_subtitles"]["composite_index"].values

    # 執行配對t檢驗
    t_stat_comp, p_two_comp = ttest_rel(composite_with, composite_without)
    p_one_comp = p_two_comp / 2
    d_comp = (np.mean(composite_with) - np.mean(composite_without)) / np.std(np.concatenate([composite_with, composite_without]))

    print("\n複合指數t檢驗結果:")
    print(f"  t(14) = {t_stat_comp:.3f}")
    print(f"  p (one-tailed) = {p_one_comp:.4f}")
    print(f"  Cohen's d = {d_comp:.3f}")
